InΒ [1]:
"""
Professional Network Analysis: Keyword Co-occurrence Study
=========================================================

Author: Rosalina Torres
Title: "The Science Behind the Art"

This module provides comprehensive network analysis tools for examining keyword 
co-occurrence patterns in large datasets. It generates multiple visualization 
perspectives to reveal community structures, importance rankings, and connection patterns.

Dependencies:
- pandas, networkx, matplotlib, numpy, seaborn
- Optional: python-louvain (for community detection)
"""

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import warnings
import time

# Configuration
# NOTE(review): this silences *all* warnings globally (deprecations, pandas
# dtype warnings, pagerank convergence notices) — consider narrowing the filter.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')  # 'seaborn-v0_8-*' names require matplotlib >= 3.6
sns.set_palette("husl")  # evenly-spaced hues; suits categorical/community coloring

class NetworkAnalyzer:
    """
    A comprehensive network analysis toolkit for keyword co-occurrence data.

    This class handles data loading, network construction, and generates multiple
    analytical visualizations to understand keyword relationships and importance.

    Typical usage:
        analyzer = NetworkAnalyzer('co_occurrence_matrix.csv')
        analyzer.run_complete_analysis()
    """
    
    def __init__(self, data_path=None):
        """
        Initialize the NetworkAnalyzer.
        
        Args:
            data_path (str, optional): Path to co-occurrence matrix CSV file.
                If omitted, find_data_file() searches common locations.
        """
        self.data_path = data_path
        self.co_occurrence_matrix = None  # pandas DataFrame after load_data()
        self.network = None               # networkx Graph after build_network()
        self.logger = None                # reserved; never assigned elsewhere in this class
        
    def find_data_file(self):
        """
        Locate the co-occurrence matrix file from common paths.
        
        Checks the explicitly supplied data_path first, then the current
        directory, then the user's Downloads and Desktop folders.
        
        Returns:
            str or None: Path to the data file if found, None otherwise
        """
        if self.data_path and os.path.exists(self.data_path):
            return self.data_path
            
        common_paths = [
            "co_occurrence_matrix.csv",
            os.path.expanduser("~/Downloads/co_occurrence_matrix.csv"),
            os.path.expanduser("~/Desktop/co_occurrence_matrix.csv")
        ]
        
        for path in common_paths:
            if os.path.exists(path):
                print(f"βœ… Found data file: {path}")
                return path
                
        print("⚠️ No data file found in common locations")
        return None
    
    def load_data(self):
        """
        Load and clean the co-occurrence matrix data.
        
        Reads the CSV (first column as index), normalizes keyword labels via
        _clean_keyword, and drops rows/columns whose label cleans to "".
        
        Returns:
            bool: True if data loaded successfully, False otherwise
        """
        file_path = self.find_data_file()
        
        if not file_path:
            print("❌ Cannot proceed without data file")
            return False
            
        try:
            self.co_occurrence_matrix = pd.read_csv(file_path, index_col=0)
            print(f"πŸ“Š Loaded dataset: {self.co_occurrence_matrix.shape}")
            
            # Clean keyword names on both axes so labels line up when indexing.
            self.co_occurrence_matrix.columns = self.co_occurrence_matrix.columns.map(self._clean_keyword)
            self.co_occurrence_matrix.index = self.co_occurrence_matrix.index.map(self._clean_keyword)
            
            # Remove rows/columns whose keyword cleaned to the empty string.
            self.co_occurrence_matrix = self.co_occurrence_matrix.loc[
                self.co_occurrence_matrix.index != '', 
                self.co_occurrence_matrix.columns != ''
            ]
            
            print(f"🧹 Cleaned data: {self.co_occurrence_matrix.shape}")
            return True
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            return False
    
    def _clean_keyword(self, keyword):
        """
        Clean and standardize a keyword string.
        
        Replaces "--" separators with spaces, collapses whitespace runs, and
        lowercases. NaN/None inputs become "" (later dropped by load_data).
        """
        if pd.isna(keyword):
            return ""
        cleaned = " ".join(str(keyword).replace("--", " ").split())
        return cleaned.strip().lower()
    
    def build_network(self, min_weight=1):
        """
        Build a weighted undirected network from the co-occurrence matrix.
        
        Args:
            min_weight (int): Minimum edge weight to include in network
            
        Returns:
            bool: True if network built successfully, False otherwise
        """
        if self.co_occurrence_matrix is None:
            print("❌ No data loaded. Call load_data() first.")
            return False
            
        print(f"πŸ”¨ Building network (min_weight={min_weight})...")
        
        self.network = nx.Graph()
        
        # Add edges based on co-occurrence weights.
        # NOTE: each unordered pair is visited twice (a,b) and (b,a); the second
        # add_edge overwrites the first, which is harmless for symmetric matrices
        # but means the [b, a] value wins if the matrix is asymmetric.
        for word1 in self.co_occurrence_matrix.index:
            for word2 in self.co_occurrence_matrix.columns:
                if word1 != word2:
                    weight = self.co_occurrence_matrix.at[word1, word2]
                    if pd.notna(weight) and weight >= min_weight:
                        self.network.add_edge(word1, word2, weight=weight)
        
        # Remove isolated nodes (keywords with no qualifying co-occurrence).
        isolated_nodes = list(nx.isolates(self.network))
        self.network.remove_nodes_from(isolated_nodes)
        
        print(f"βœ… Network built: {self.network.number_of_nodes()} nodes, {self.network.number_of_edges()} edges")
        return True
    
    def analyze_network_properties(self):
        """
        Calculate and display key network properties.
        
        Prints node/edge counts, density, average degree, component count, and
        (when centrality succeeds) the top-10 PageRank keywords.
        
        Returns:
            dict: Dictionary containing network metrics; includes
                'top_keywords' (list of (keyword, score)) when PageRank succeeds.
        """
        if self.network is None:
            print("❌ No network available. Build network first.")
            return {}
            
        print("\nπŸ“Š NETWORK ANALYSIS RESULTS")
        print("=" * 50)
        
        node_count = self.network.number_of_nodes()
        metrics = {
            'nodes': node_count,
            'edges': self.network.number_of_edges(),
            'density': nx.density(self.network),
            # Guard against ZeroDivisionError on an empty network (possible when
            # every node was isolated and removed by build_network).
            'avg_degree': (sum(dict(self.network.degree()).values()) / node_count
                           if node_count else 0.0),
            'components': nx.number_connected_components(self.network)
        }
        
        print(f"Total Keywords (Nodes): {metrics['nodes']}")
        print(f"Total Connections (Edges): {metrics['edges']}")
        print(f"Network Density: {metrics['density']:.4f}")
        print(f"Average Degree: {metrics['avg_degree']:.2f}")
        print(f"Connected Components: {metrics['components']}")
        
        # Calculate centrality measures; PageRank can fail to converge.
        try:
            print("\nπŸ” Calculating importance metrics...")
            degree_centrality = nx.degree_centrality(self.network)
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
            
            # Top 10 most important keywords by PageRank score.
            top_keywords = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
            print("\nπŸ† TOP 10 MOST IMPORTANT KEYWORDS:")
            for i, (keyword, score) in enumerate(top_keywords, 1):
                print(f"{i:2d}. {keyword.title():<25} (Score: {score:.4f})")
                
            metrics['top_keywords'] = top_keywords
            
        except Exception as e:
            print(f"⚠️ Error calculating centrality: {e}")
            
        return metrics
    
    def create_community_overview(self, figsize=(20, 16)):
        """
        Create the community-structure visualization.
        
        Uses python-louvain community colors when available; otherwise falls
        back to degree-centrality coloring.
        
        Args:
            figsize (tuple): Matplotlib figure size in inches.
        """
        if self.network is None:
            return
            
        print("🎨 Creating Community Structure Overview...")
        
        degree_centrality = nx.degree_centrality(self.network)
        
        # Try Louvain community detection (optional dependency).
        try:
            import community as community_louvain
            communities = community_louvain.best_partition(self.network, weight='weight')
            node_colors = [communities[node] for node in self.network.nodes()]
            cmap = plt.cm.tab20
            title_extra = f" ({len(set(communities.values()))} Communities)"
        except ImportError:
            node_colors = [degree_centrality[node] for node in self.network.nodes()]
            cmap = plt.cm.viridis
            title_extra = " (Degree-based Coloring)"
        
        pos = nx.spring_layout(self.network, k=3, iterations=30, weight='weight')
        node_sizes = [50 + degree_centrality[node] * 500 for node in self.network.nodes()]
        
        plt.figure(figsize=figsize, facecolor='white')
        nx.draw(self.network, pos, node_size=node_sizes, node_color=node_colors,
               cmap=cmap, alpha=0.8, linewidths=1, edgecolors='white',
               edge_color='gray', width=0.3)
        
        plt.title(f'Keyword Network Community Structure{title_extra}\n'
                 f'{self.network.number_of_nodes()} Keywords, {self.network.number_of_edges()} Connections', 
                 fontsize=20, fontweight='bold', pad=20)
        
        # Add network statistics (avg degree guarded against an empty network).
        node_count = self.network.number_of_nodes()
        avg_degree = (sum(dict(self.network.degree()).values()) / node_count
                      if node_count else 0.0)
        stats_text = f"""Network Statistics:
        Density: {nx.density(self.network):.4f}
        Avg Degree: {avg_degree:.1f}
        Components: {nx.number_connected_components(self.network)}"""
        
        plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, 
                fontsize=12, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
        
        plt.axis('off')
        plt.tight_layout()
        plt.show()
    
    def create_importance_analysis(self, figsize=(20, 16)):
        """
        Create the PageRank importance visualization.
        
        Node size and color both encode PageRank score; the top-10 keywords
        are listed in a corner box.
        
        Args:
            figsize (tuple): Matplotlib figure size in inches.
        """
        if self.network is None:
            return
            
        print("🎨 Creating Importance Analysis...")
        
        try:
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
        except Exception:
            # Fall back to uniform scores if PageRank fails to converge.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pagerank = {node: 1/self.network.number_of_nodes() for node in self.network.nodes()}
        
        pos = nx.spring_layout(self.network, k=3, iterations=30, weight='weight')
        node_sizes = [50 + pagerank[node] * 2000 for node in self.network.nodes()]
        node_colors = [pagerank[node] for node in self.network.nodes()]
        
        plt.figure(figsize=figsize, facecolor='white')
        
        # Draw edges first so nodes render on top.
        nx.draw_networkx_edges(self.network, pos, edge_color='lightgray', width=0.3, alpha=0.5)
        
        # Draw nodes; keep the PathCollection for the colorbar.
        nodes = nx.draw_networkx_nodes(self.network, pos, node_size=node_sizes, 
                                      node_color=node_colors, cmap=plt.cm.plasma, 
                                      alpha=0.8, linewidths=1, edgecolors='white')
        
        plt.colorbar(nodes, label='Importance Score', shrink=0.8)
        plt.title('Keyword Importance Analysis\nNode Size & Color = Importance Score', 
                 fontsize=20, fontweight='bold', pad=20)
        
        # Show top keywords in a corner annotation.
        top_keywords = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
        top_text = "Top 10 Keywords:\n" + "\n".join([
            f"{i+1}. {node.title()}" for i, (node, _) in enumerate(top_keywords)
        ])
        
        plt.text(0.02, 0.02, top_text, transform=plt.gca().transAxes, 
                fontsize=10, verticalalignment='bottom',
                bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.9))
        
        plt.axis('off')
        plt.tight_layout()
        plt.show()
    
    def create_hub_analysis(self, figsize=(20, 16), top_n=25):
        """
        Create the hub-keywords visualization with labels.
        
        Draws the induced subgraph over the top_n keywords by PageRank; node
        size encodes PageRank, node color encodes degree centrality.
        
        Args:
            figsize (tuple): Matplotlib figure size in inches.
            top_n (int): Number of top-ranked keywords to include.
        """
        if self.network is None:
            return
            
        print(f"🎨 Creating Hub Analysis (Top {top_n} Keywords)...")
        
        try:
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
            degree_centrality = nx.degree_centrality(self.network)
        except Exception:
            # Uniform fallback if centrality computation fails (was bare `except:`).
            pagerank = {node: 1 for node in self.network.nodes()}
            degree_centrality = {node: 1 for node in self.network.nodes()}
        
        # Get top N most important nodes and their induced subgraph.
        top_nodes = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:top_n]
        hub_network = self.network.subgraph([node for node, _ in top_nodes]).copy()
        
        if len(hub_network.nodes()) == 0:
            print("❌ No hub nodes found")
            return
        
        pos = nx.spring_layout(hub_network, k=5, iterations=50, weight='weight')
        node_sizes = [200 + pagerank[node] * 1500 for node in hub_network.nodes()]
        node_colors = [degree_centrality[node] for node in hub_network.nodes()]
        
        plt.figure(figsize=figsize, facecolor='white')
        nx.draw(hub_network, pos, node_size=node_sizes, node_color=node_colors,
               cmap=plt.cm.coolwarm, alpha=0.9, linewidths=2, edgecolors='black',
               edge_color='darkblue', width=2, with_labels=True, 
               font_size=11, font_weight='bold', font_color='darkblue')
        
        plt.title(f'Top {top_n} Hub Keywords Network\nWith Connection Patterns', 
                 fontsize=20, fontweight='bold', pad=20)
        
        # Add ranking list (at most 15 entries to keep the box readable).
        ranking_text = f"Importance Rankings:\n" + "\n".join([
            f"{i+1:2d}. {node.title()}: {score:.4f}" 
            for i, (node, score) in enumerate(top_nodes[:15])
        ])
        
        plt.text(0.02, 0.98, ranking_text, transform=plt.gca().transAxes, 
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightcyan', alpha=0.9))
        
        plt.axis('off')
        plt.tight_layout()
        plt.show()
    
    def create_strong_connections(self, figsize=(20, 16), percentile=75):
        """
        Create the strongest-connections visualization.
        
        Keeps only edges whose weight is at or above the given percentile of
        all edge weights, and labels the higher-degree nodes of that subgraph.
        
        Args:
            figsize (tuple): Matplotlib figure size in inches.
            percentile (int): Weight percentile cutoff; edges below it are dropped.
        """
        if self.network is None:
            return
            
        print(f"🎨 Creating Strongest Connections (Top {100-percentile}%)...")
        
        edges = list(self.network.edges(data=True))
        if len(edges) == 0:
            print("❌ No edges found")
            return
        
        weights = [d['weight'] for u, v, d in edges]
        threshold = np.percentile(weights, percentile)
        
        strong_edges = [(u, v) for u, v, d in edges if d['weight'] >= threshold]
        strong_network = self.network.edge_subgraph(strong_edges).copy()
        
        if len(strong_network.nodes()) == 0:
            print("❌ No strong connections found")
            return
        
        print(f"   Showing {len(strong_network.edges())} strongest connections out of {len(edges)} total")
        
        pos = nx.spring_layout(strong_network, k=4, iterations=50, weight='weight')
        
        plt.figure(figsize=figsize, facecolor='white')
        
        # Scale edge widths into [1, 9]; when every retained edge has the same
        # weight the normalization would divide by zero, so use a mid width.
        edge_weights = [strong_network[u][v]['weight'] for u, v in strong_network.edges()]
        max_weight, min_weight = max(edge_weights), min(edge_weights)
        weight_span = max_weight - min_weight
        edge_widths = [((w - min_weight) / weight_span * 8 + 1) if weight_span else 5.0
                       for w in edge_weights]
        
        node_degrees = dict(strong_network.degree())
        node_sizes = [100 + node_degrees[node] * 50 for node in strong_network.nodes()]
        
        # Draw network
        nx.draw_networkx_edges(strong_network, pos, width=edge_widths, 
                              edge_color='red', alpha=0.7)
        nx.draw_networkx_nodes(strong_network, pos, node_size=node_sizes, 
                              node_color='lightblue', alpha=0.9, 
                              linewidths=2, edgecolors='navy')
        
        # Label only nodes at or above the 70th degree percentile.
        high_degree_nodes = {
            node: node for node in strong_network.nodes() 
            if node_degrees[node] >= np.percentile(list(node_degrees.values()), 70)
        }
        
        if high_degree_nodes:
            nx.draw_networkx_labels(strong_network, pos, labels=high_degree_nodes,
                                   font_size=10, font_weight='bold', font_color='darkblue')
        
        plt.title(f'Strongest Keyword Connections\nTop {100-percentile}% of Connections (Threshold: {threshold:.1f})', 
                 fontsize=20, fontweight='bold', pad=20)
        
        # Add statistics
        stats_text = f"""Connection Statistics:
        Strongest: {max_weight:.1f}
        Weakest shown: {min_weight:.1f}
        Average: {np.mean(edge_weights):.1f}
        Total connections: {len(strong_network.edges())}"""
        
        plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, 
                fontsize=12, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightpink', alpha=0.8))
        
        plt.axis('off')
        plt.tight_layout()
        plt.show()
    
    def run_complete_analysis(self):
        """
        Execute the complete network analysis pipeline.
        
        Runs data loading, network construction, property analysis, and all
        four visualization perspectives in order, stopping early on failure.
        
        Returns:
            bool: True if the full pipeline completed, False otherwise.
        """
        print("πŸš€ COMPREHENSIVE KEYWORD NETWORK ANALYSIS")
        print("=" * 60)
        
        # Load and prepare data
        if not self.load_data():
            return False
        
        # Build network
        if not self.build_network():
            return False
        
        # Analyze network properties
        metrics = self.analyze_network_properties()
        
        # Generate visualizations
        print("\n🎨 Creating comprehensive visualizations...")
        print("=" * 50)
        
        print("\n1️⃣ COMMUNITY STRUCTURE OVERVIEW")
        self.create_community_overview()
        
        print("\n2️⃣ KEYWORD IMPORTANCE ANALYSIS") 
        self.create_importance_analysis()
        
        print("\n3️⃣ HUB KEYWORDS ANALYSIS")
        self.create_hub_analysis()
        
        print("\n4️⃣ STRONGEST CONNECTIONS NETWORK")
        self.create_strong_connections()
        
        print("\nβœ… ANALYSIS COMPLETE!")
        print("Generated 4 comprehensive network visualizations")
        print(f"Network contains {metrics.get('nodes', 0)} keywords with {metrics.get('edges', 0)} connections")
        
        return True


def main():
    """
    Command-line entry point for the keyword network analysis.

    Usage:
        analyzer = NetworkAnalyzer('path/to/co_occurrence_matrix.csv')
        analyzer.run_complete_analysis()
    """
    analyzer = NetworkAnalyzer()

    # Bail out early with a failure message if any pipeline stage failed.
    if not analyzer.run_complete_analysis():
        print("\n❌ Analysis failed. Please check your data file and try again.")
        return

    summary = [
        "\nπŸ“Š ANALYSIS SUMMARY",
        "=" * 30,
        "βœ… Data successfully loaded and analyzed",
        "βœ… Network structure revealed",
        "βœ… Key insights identified",
        "βœ… Visualizations generated",
        "\nRefer to the generated plots for detailed insights into",
        "keyword relationships, community structures, and importance rankings.",
    ]
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
πŸš€ COMPREHENSIVE KEYWORD NETWORK ANALYSIS
============================================================
βœ… Found data file: co_occurrence_matrix.csv
πŸ“Š Loaded dataset: (276, 276)
🧹 Cleaned data: (276, 276)
πŸ”¨ Building network (min_weight=1)...
βœ… Network built: 276 nodes, 5115 edges

πŸ“Š NETWORK ANALYSIS RESULTS
==================================================
Total Keywords (Nodes): 276
Total Connections (Edges): 5115
Network Density: 0.1348
Average Degree: 37.07
Connected Components: 1

πŸ” Calculating importance metrics...

πŸ† TOP 10 MOST IMPORTANT KEYWORDS:
 1. Management                (Score: 0.0646)
 2. Organizational            (Score: 0.0569)
 3. Behavior                  (Score: 0.0229)
 4. Business                  (Score: 0.0207)
 5. Industrial                (Score: 0.0195)
 6. Relations                 (Score: 0.0153)
 7. Psychology                (Score: 0.0146)
 8. Decision                  (Score: 0.0143)
 9. Making                    (Score: 0.0135)
10. Personnel                 (Score: 0.0135)

🎨 Creating comprehensive visualizations...
==================================================

1️⃣ COMMUNITY STRUCTURE OVERVIEW
🎨 Creating Community Structure Overview...
No description has been provided for this image
2️⃣ KEYWORD IMPORTANCE ANALYSIS
🎨 Creating Importance Analysis...
No description has been provided for this image
3️⃣ HUB KEYWORDS ANALYSIS
🎨 Creating Hub Analysis (Top 25 Keywords)...
No description has been provided for this image
4️⃣ STRONGEST CONNECTIONS NETWORK
🎨 Creating Strongest Connections (Top 25%)...
   Showing 1952 strongest connections out of 5115 total
No description has been provided for this image
βœ… ANALYSIS COMPLETE!
Generated 4 comprehensive network visualizations
Network contains 276 keywords with 5115 connections

πŸ“Š ANALYSIS SUMMARY
==============================
βœ… Data successfully loaded and analyzed
βœ… Network structure revealed
βœ… Key insights identified
βœ… Visualizations generated

Refer to the generated plots for detailed insights into
keyword relationships, community structures, and importance rankings.
InΒ [Β ]: